{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b6751161",
   "metadata": {},
   "source": [
    "# Chapter 4 - Probabilistic Matching"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7b1093c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df_w = pd.read_csv('mps_wiki_clean.csv')\n",
    "df_t = pd.read_csv('mps_they_clean.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bef8b9d6",
   "metadata": {},
   "source": [
    "# Extract Match and NotMatch Population"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a90f1c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total match population, computed by cross product: 650 x 650 = 420550\n",
    "\n",
    "cross = df_w.merge(df_t, how='cross',suffixes=('_w', '_t'))\n",
    "len(cross)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "958fe6f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate exact matches as feature columns\n",
    "\n",
    "cross['Fmatch'] = (cross['Firstname_w']==cross['Firstname_t'])\n",
    "cross['Lmatch'] = (cross['Lastname_w']==cross['Lastname_t'])\n",
    "cross['Cmatch'] = (cross['Constituency_w']==cross['Constituency_t'])\n",
    "\n",
    "cross['Tmatch'] = sum([cross['Fmatch'],cross['Lmatch'],cross['Cmatch']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ca531af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract the match population as Constituency and either Firstname or Lastname matches\n",
    "\n",
    "match = cross[cross['Cmatch'] & (cross['Fmatch'] | cross['Lmatch'])]\n",
    "len(match)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26dd514b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract notmatch population as either Consitutency doesn't match or Firstname and Lastname don't match (by-election)\n",
    "\n",
    "notmatch=cross[(~cross['Cmatch']) | (~cross['Fmatch'] & ~cross['Lmatch'])]\n",
    "len(notmatch)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "98c9ed2e",
   "metadata": {},
   "source": [
    " # Firstname Counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64e4722d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Within match population how many Firstnames match?\n",
    "\n",
    "first_match = match[match['Fmatch']]\n",
    "len(first_match)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5f23837",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Within match population how many Firstnames don't match?\n",
    "\n",
    "notfirst_match = match[~match['Fmatch']]\n",
    "len(notfirst_match)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a70cc46",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Within the non match population how many Firstnames match?\n",
    "\n",
    "first_notmatch = notmatch[notmatch['Fmatch']]\n",
    "len(first_notmatch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd578a95",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Within the non match population how many Firstnames don't match?\n",
    "\n",
    "notfirst_notmatch = notmatch[~notmatch['Fmatch']]\n",
    "len(notfirst_notmatch)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9ebabb2",
   "metadata": {},
   "source": [
    "## Firstname Match Probabilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f76a387c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# If Firstname matches how likely is it we have a match?\n",
    "# Calculate as number of Firstname matches within match population divided by total number of Firstname matches\n",
    "\n",
    "prob_match_first = len(first_match) / (len(first_match) + len(first_notmatch))\n",
    "prob_match_first"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "211bfa1d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Probability of a match in full population\n",
    "\n",
    "prob_match = len(match) / len(cross)\n",
    "prob_match"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e94c5ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Probability of Firstname match within match population\n",
    "\n",
    "prob_first_match = len(first_match) / len(match)\n",
    "prob_first_match"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b45a72d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Probability of Firstname match within full population\n",
    "\n",
    "prob_first = len(cross[cross['Fmatch']]) / len(cross)\n",
    "prob_first"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6b99c75",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Probability of match within population of Firstname matches\n",
    "\n",
    "prob_match_first = prob_first_match * prob_match / prob_first\n",
    "prob_match_first"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a06c7606",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Probability that Firstname doesn't match within not match population\n",
    "\n",
    "prob_first_notmatch = len(first_notmatch) / len(notmatch)\n",
    "prob_first_notmatch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc1f6ad5",
   "metadata": {},
   "outputs": [],
   "source": [
    "mf = prob_first_match\n",
    "uf = prob_first_notmatch\n",
    "lmbda = prob_match\n",
    "(lmbda * mf) / (lmbda * mf + (1-lmbda) * uf)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b61c2fa3",
   "metadata": {},
   "source": [
    "# Lastname Counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "645caa1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Within match population how many Lastnames match?\n",
    "\n",
    "last_match = match[match['Lmatch']]\n",
    "len(last_match)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f84f187d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Within match population how many Lastnames don't match?\n",
    "\n",
    "notlast_match = match[~match['Lmatch']]\n",
    "len(notlast_match)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d30c9ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Within the non match population how many Lastnames match?\n",
    "\n",
    "last_notmatch = notmatch[notmatch['Lmatch']]\n",
    "len(last_notmatch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b16573c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Within the non match population how many Lastnames don't match?\n",
    "\n",
    "notlast_notmatch = notmatch[~notmatch['Lmatch']]\n",
    "len(notlast_notmatch)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5072055",
   "metadata": {},
   "source": [
    "## Lastname Match Probabilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d09dfb34",
   "metadata": {},
   "outputs": [],
   "source": [
    "last = cross[cross['Lmatch']]\n",
    "len(last)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d67c22b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# If Lastname matches how likely is it we have a match?\n",
    "# Calculate as number of Lastname matches within match population divided by total number of Lastname matches\n",
    "\n",
    "prob_match_last = len(last_match) / (len(last_match) + len(last_notmatch))\n",
    "prob_match_last"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b9154f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Probability of Lastname match within full population\n",
    "\n",
    "prob_last = len(last)/len(cross)\n",
    "prob_last"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62e5f7bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Probability of Lastname match within match population\n",
    "\n",
    "prob_last_match = len(last_match) / len (match)\n",
    "prob_last_match"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cca1eaa6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Probability that Firstname doesn't match within not match population\n",
    "\n",
    "prob_last_notmatch = len (last_notmatch) / len(notmatch)\n",
    "prob_last_notmatch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7485b902",
   "metadata": {},
   "outputs": [],
   "source": [
    "ml = prob_last_match\n",
    "ul = prob_last_notmatch\n",
    "lmbda = prob_match\n",
    "(lmbda * ml) / (lmbda * ml + (1-lmbda) * ul)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2487a0d7",
   "metadata": {},
   "source": [
    "# Firstname and Lastname Counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55bc79ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "last_first_match = first_match[first_match['Lmatch']]\n",
    "len(last_first_match)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97daed4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "ml1 = prob_last_match\n",
    "ul1 = prob_last_notmatch\n",
    "(lmbda * ml1) / (lmbda * ml1 + (1-lmbda) * ul1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d30789c",
   "metadata": {},
   "outputs": [],
   "source": [
    "notlast_first_match = first_match[~first_match['Lmatch']]\n",
    "len(notlast_first_match)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ea57f27",
   "metadata": {},
   "outputs": [],
   "source": [
    "last_first_notmatch = first_notmatch[first_notmatch['Lmatch']]\n",
    "len(last_first_notmatch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f68a1dd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "notlast_first_notmatch = first_notmatch[~first_notmatch['Lmatch']]\n",
    "len(notlast_first_notmatch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e373c430",
   "metadata": {},
   "outputs": [],
   "source": [
    "last_notfirst_match = notfirst_match[notfirst_match['Lmatch']]\n",
    "len(last_notfirst_match)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a457ac39",
   "metadata": {},
   "outputs": [],
   "source": [
    "notlast_notfirst_match = notfirst_match[~notfirst_match['Lmatch']]\n",
    "len(notlast_notfirst_match)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "217d13e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "last_notfirst_notmatch = notfirst_notmatch[notfirst_notmatch['Lmatch']]\n",
    "len(last_notfirst_notmatch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b736c49",
   "metadata": {},
   "outputs": [],
   "source": [
    "notlast_notfirst_notmatch = notfirst_notmatch[~notfirst_notmatch['Lmatch']]\n",
    "len(notlast_notfirst_notmatch)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8ca57656",
   "metadata": {},
   "source": [
    "# Firstname and Lastname Match Probabilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbd4cfbd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# If Firstname matches but Lastname doesn't what is the probability that it's a match?\n",
    "\n",
    "prob_match_first_notlast = len(notlast_first_match) / (len(notlast_first_match) + len(notlast_first_notmatch))\n",
    "# 4 / (4 + 2052 )\n",
    "prob_match_first_notlast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af0b455b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# If Lastname matches but Firstname doesn't what is the probability that it's a match?\n",
    "\n",
    "prob_match_notfirst_last = len(last_notfirst_match) / (len(last_notfirst_match) + len(last_notfirst_notmatch))\n",
    "# 5 / (5 + 349) \n",
    "prob_match_notfirst_last"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f2f09fa1",
   "metadata": {},
   "source": [
    "# Firstname and Lastname probabilities with m and u values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e89f98b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Probability of match if Firstname and Lastname matches\n",
    "\n",
    "(mf * ml * lmbda) / (mf * ml * lmbda + uf * ul * (1-lmbda))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d81a9789",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Probability of match if Firstname matches but Lastname doesn't\n",
    "\n",
    "(mf * (1-ml) * lmbda) / ((mf * (1-ml) * lmbda) + (uf * (1-ul) * (1-lmbda)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42f6595d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Probability of match if Firstname doesn't match but Lastname does\n",
    "\n",
    "((1-mf) * ml * lmbda) / (((1-mf) * ml * lmbda) + ((1-uf) * ul * (1-lmbda)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7287aae5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Probability of match if neither Firstname nor Lastname matches\n",
    "\n",
    "((1-mf) * (1-ml) * lmbda) / (((1-mf) * (1-ml) * lmbda) + ((1-uf) * (1-ul) * (1-lmbda)))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f51ae185",
   "metadata": {},
   "source": [
    "# Expectation Maximisation Algorithm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20dc7de1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Helper function to calculate probability based on match features\n",
    "\n",
    "def match_prb(Fmatch,Lmatch,Cmatch,mf1,ml1,mc1,uf1,ul1,uc1, lmbda):\n",
    "    if (Fmatch==1):\n",
    "        mf = mf1\n",
    "        uf = uf1\n",
    "    else:\n",
    "        mf = (1-mf1)\n",
    "        uf = (1-uf1)\n",
    "    if (Lmatch==1):\n",
    "        ml = ml1\n",
    "        ul = ul1\n",
    "    else:\n",
    "        ml = (1-ml1)\n",
    "        ul = (1-ul1)\n",
    "    if (Cmatch==1):\n",
    "        mc = mc1\n",
    "        uc = uc1\n",
    "    else:\n",
    "        mc = (1-mc1)\n",
    "        uc = (1-uc1)\n",
    "    prob = (lmbda * ml * mf * mc) / (lmbda * ml * mf * mc + (1-lmbda) * ul * uf * uc)\n",
    "    return(prob)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "07f1b6f5",
   "metadata": {},
   "source": [
    "# EM Iteration 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c63085b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select population where majority of columns match\n",
    "\n",
    "it1_match = cross[cross['Tmatch']>=2]\n",
    "it1_notmatch = cross[cross['Tmatch']<2]\n",
    "\n",
    "len(it1_match)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0e7ee94",
   "metadata": {},
   "outputs": [],
   "source": [
    "lmbda1 = len(it1_match)/len(cross)\n",
    "lmbda1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3421faa9",
   "metadata": {},
   "outputs": [],
   "source": [
    "it1_match[~it1_match['Fmatch'] | ~it1_match['Lmatch']][['Constituency_w', 'Firstname_w','Firstname_t','Lastname_w','Lastname_t']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67b2f904",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate m values\n",
    "\n",
    "mfi1 = len(it1_match[it1_match['Fmatch']])/len(it1_match)\n",
    "mli1 = len(it1_match[it1_match['Lmatch']])/len(it1_match)\n",
    "mci1 = len(it1_match[it1_match['Cmatch']])/len(it1_match)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64db8919",
   "metadata": {},
   "outputs": [],
   "source": [
    "mfi1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b36c544f",
   "metadata": {},
   "outputs": [],
   "source": [
    "mli1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e91023c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "mci1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ad6f683",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate u values\n",
    "\n",
    "ufi1 = len(it1_notmatch[it1_notmatch['Fmatch']])/len(it1_notmatch)\n",
    "uli1 = len(it1_notmatch[it1_notmatch['Lmatch']])/len(it1_notmatch)\n",
    "uci1 = len(it1_notmatch[it1_notmatch['Cmatch']])/len(it1_notmatch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ea2edba",
   "metadata": {},
   "outputs": [],
   "source": [
    "ufi1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53269471",
   "metadata": {},
   "outputs": [],
   "source": [
    "uli1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "576c4993",
   "metadata": {},
   "outputs": [],
   "source": [
    "uci1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2bd5a344",
   "metadata": {},
   "source": [
    "# EM Iteration 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7d47eef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate the match probability for each combination\n",
    "\n",
    "cross['prob'] = cross.apply(lambda x: match_prb(x.Fmatch,x.Lmatch,x.Cmatch,\n",
    "                                       mfi1,mli1,mci1,  \n",
    "                                        ufi1,uli1,uci1,\n",
    "                                        lmbda1), axis=1)\n",
    "\n",
    "# Set match threshold as > 0.99 probability\n",
    "\n",
    "it2_match = cross[cross['prob']>0.99]\n",
    "it2_notmatch = cross[cross['prob']<=0.99]\n",
    "len(it2_match)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5584f411",
   "metadata": {},
   "outputs": [],
   "source": [
    "lmbda2 = len(it2_match)/len(cross)\n",
    "lmbda2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e3acee4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select records just below the match threshold\n",
    "\n",
    "it2_notmatch[it2_notmatch['prob']>0.9][['Constituency_w', 'Lastname_w','Lastname_t','prob']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e4e9cff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recalculate m values\n",
    "\n",
    "mfi2 = len(it2_match[it2_match['Fmatch']])/len(it2_match)\n",
    "mli2 = len(it2_match[it2_match['Lmatch']])/len(it2_match)\n",
    "mci2 = len(it2_match[it2_match['Cmatch']])/len(it2_match)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2916e51",
   "metadata": {},
   "outputs": [],
   "source": [
    "mfi2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37321da5",
   "metadata": {},
   "outputs": [],
   "source": [
    "mli2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "accf7d24",
   "metadata": {},
   "outputs": [],
   "source": [
    "mci2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18d0e72d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recalculate u values\n",
    "\n",
    "ufi2 = len(it2_notmatch[it2_notmatch['Fmatch']])/len(it2_notmatch)\n",
    "uli2 = len(it2_notmatch[it2_notmatch['Lmatch']])/len(it2_notmatch)\n",
    "uci2 = len(it2_notmatch[it2_notmatch['Cmatch']])/len(it2_notmatch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84150885",
   "metadata": {},
   "outputs": [],
   "source": [
    "ufi2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f7d92b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "uli2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "976c9ae6",
   "metadata": {},
   "outputs": [],
   "source": [
    "uci2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6c4a12a0",
   "metadata": {},
   "source": [
    "# EM Iteration 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bbe6ab4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate the match probability for each combination\n",
    "\n",
    "cross['prob'] = cross.apply(lambda x: match_prb(x.Fmatch,x.Lmatch,x.Cmatch,\n",
    "                                        mfi2,mli2,mci2,  \n",
    "                                        ufi2,uli2,uci2,\n",
    "                                        lmbda2), axis=1)\n",
    "\n",
    "# Set match threshold as > 0.99 probability\n",
    "\n",
    "it3_match = cross[cross['prob']>0.99]\n",
    "it3_notmatch = cross[cross['prob']<=0.99]\n",
    "len(it3_match)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f06fb6e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "lmbda3 = len(it3_match)/len(cross)\n",
    "lmbda3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbc07280",
   "metadata": {},
   "outputs": [],
   "source": [
    "# How many records are above match threshold when either Firstname or Lastname don't match\n",
    "\n",
    "it3_match[~it3_match['Fmatch'] | ~it3_match['Lmatch']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e25cc5c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recalculate m values\n",
    "\n",
    "mfi3 = len(it3_match[it3_match['Fmatch']])/len(it3_match)\n",
    "mli3 = len(it3_match[it3_match['Lmatch']])/len(it3_match)\n",
    "mci3 = len(it3_match[it3_match['Cmatch']])/len(it3_match)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97962b9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "mfi3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b37029ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "mli3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6039a2e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "mci3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79f28ca1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recalculate u values\n",
    "\n",
    "ufi3 = len(it3_notmatch[it3_notmatch['Fmatch']])/len(it3_notmatch)\n",
    "uli3 = len(it3_notmatch[it3_notmatch['Lmatch']])/len(it3_notmatch)\n",
    "uci3 = len(it3_notmatch[it3_notmatch['Cmatch']])/len(it3_notmatch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "548587ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "ufi3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c766100",
   "metadata": {},
   "outputs": [],
   "source": [
    "uli3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8126da29",
   "metadata": {},
   "outputs": [],
   "source": [
    "uci3"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9d5b9150",
   "metadata": {},
   "source": [
    "# Splink"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "128f3a60",
   "metadata": {},
   "outputs": [],
   "source": [
    "#%pip install splink\n",
    "# After install you may need to restart the kernel and reload datasets in cell 1.\n",
    "\n",
    "import splink"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82620bb5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add a unique_id column needed by Splink to both datasets\n",
    "\n",
    "df_w['unique_id'] = df_w.index\n",
    "df_t['unique_id'] = df_t.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db53661a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w['Flink'] = None\n",
    "df_t['Notes'] = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93df80f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w = df_w[['Firstname','Lastname','Constituency','Flink','Notes','unique_id']]\n",
    "df_t = df_t[['Firstname','Lastname','Constituency','Flink','Notes','unique_id']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2bd4424",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Splink settings to match on Firstname and Lastname\n",
    "\n",
    "from splink.duckdb.linker import DuckDBLinker\n",
    "from splink.duckdb import comparison_library as cl\n",
    "\n",
    "settings = {\n",
    "    \"link_type\": \"link_only\",\n",
    "    \"comparisons\": [\n",
    "        cl.exact_match(\"Firstname\"),\n",
    "        cl.exact_match(\"Lastname\"),\n",
    "        cl.exact_match(\"Constituency\"),\n",
    "    ],\n",
    "#    \"retain_intermediate_calculation_columns\" : True,\n",
    "#    \"retain_matching_columns\" : True,  \n",
    "}\n",
    "linker = DuckDBLinker([df_w, df_t], settings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ac2df06",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Examine distribution of columns to be matched\n",
    "\n",
    "linker.profile_columns(['Firstname','Lastname','Constituency'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3a3905f",
   "metadata": {},
   "outputs": [],
   "source": [
    "em_session = linker.estimate_parameters_using_expectation_maximisation('True', fix_u_probabilities=False, fix_probability_two_random_records_match=False, populate_probability_two_random_records_match_from_trained_values=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "369fe1ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "#linker.save_model_to_json(\"Chapter4_Splink_Settings.json\", overwrite=True)\n",
    "linker.load_settings(\"Chapter4_Splink_Settings.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "058045ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "em_session.match_weights_interactive_history_chart()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95170543",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate predictions, apply probability threshold of 0.9 and convert to dataframe\n",
    "\n",
    "pres = linker.predict(threshold_match_probability = 0.99).as_pandas_dataframe()\n",
    "len(pres)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "df9ee1c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "pres[pres['match_probability']<0.999]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "515c6de8",
   "metadata": {},
   "outputs": [],
   "source": [
    "#linker.waterfall_chart(pres[pres['match_probability']<0.999].to_dict(orient=\"records\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6a6973c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# At Threshold = 0.99, Splink False Negatives\n",
    "\n",
    "m_outer = match.merge(pres, left_on=['Constituency_t'], right_on=['Constituency_l'], how='outer')\n",
    "m_outer[m_outer['Constituency_t']!=m_outer['Constituency_l']][['Constituency_w','Lastname_w','Lastname_t']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05d42a59",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "handsonentityresolution",
   "language": "python",
   "name": "handsonentityresolution"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
